Importing the dataset

The data is from Kaggle and includes

# Original URL: https://www.kaggle.com/datasets/yasserh/walmart-dataset/data

store_data = read.csv("/Users/delracman/Documents/STAT 7500/Project/Walmart.csv") %>% 
  mutate(
    Date=as.Date(Date,format="%d-%m-%Y"),
    Year=year(Date),
    Month=month(Date),
    Day=day(Date),
    Week=week(Date)
  ) %>% 
  arrange(Date)
head(store_data)
##   Store       Date Weekly_Sales Holiday_Flag Temperature Fuel_Price      CPI
## 1     1 2010-02-05    1643690.9            0       42.31      2.572 211.0964
## 2     2 2010-02-05    2136989.5            0       40.19      2.572 210.7526
## 3     3 2010-02-05     461622.2            0       45.71      2.572 214.4249
## 4     4 2010-02-05    2135143.9            0       43.76      2.598 126.4421
## 5     5 2010-02-05     317173.1            0       39.70      2.572 211.6540
## 6     6 2010-02-05    1652635.1            0       40.43      2.572 212.6224
##   Unemployment Year Month Day Week
## 1        8.106 2010     2   5    6
## 2        8.324 2010     2   5    6
## 3        7.368 2010     2   5    6
## 4        8.623 2010     2   5    6
## 5        6.566 2010     2   5    6
## 6        7.259 2010     2   5    6

Sales Per Week Across All Stores

There are peaks at similar times each year.

store_data %>% 
  group_by(Date) %>% 
  summarize(Total_Sales=sum(Weekly_Sales)) %>% 
  ggplot() + 
    geom_line(aes(x=Date,y=Total_Sales)) +
    labs(title="Total Sales Each Week") +
    scale_x_date(breaks = function(x) seq.Date(from = as.Date("2010-01-01"), 
                                                 to = as.Date("2012-12-31"), 
                                                 by = "4 months"))

# Think about way to replace axis labels with months instead of weeks 
# (since beginning of the week corresponding with each number changes each year)
avgsales = store_data %>% 
  group_by(Year,Week) %>% 
  summarize(
    avgsales=mean(Weekly_Sales),
    scaled=avgsales/100000
  )
avgsales %>% 
  ggplot() + 
    geom_line(aes(x=Week,y=scaled,color=factor(Year))) +
    labs(
      x="Week Number",
      y="Average Weekly Sales (in hundreds of thousands)",
      title="Average Weekly Sales Per Year Across All Stores"
    ) +
    scale_x_continuous(breaks = seq(1,53,by=4))

store_data %>% 
  group_by(Year,Week) %>% 
  mutate(
    avgsales=mean(Weekly_Sales),
    scaled=avgsales/100000,
    plot_date=as.Date(paste(2013,Month,Day,sep="-"),format="%Y-%m-%d")
  ) %>% 
  ggplot() + 
    geom_line(aes(x=plot_date,y=scaled,color=factor(Year))) + 
    labs(
        x="Month-Day",
        y="Average Weekly Sales (in hundreds of thousands)",
        title="Average Weekly Sales Per Year Across All Stores"
      ) +
    scale_x_date(date_labels="%m-%d")

# Add other holidays to the store_data. Flag with "2" to differentiate from pre-existing holidays in the store_dataset
# Observation: dates are for end of week, not beginning of week (so take date and check week previous, not week after)
# Also vectorized to be able to use with ifelse
holidays = c(
  "2010-05-31", "2011-05-30", "2012-05-28",
  "2010-04-04", "2011-04-24", "2012-04-08",
  "2010-10-31", "2011-10-31", "2012-10-31",
  "2010-07-04", "2011-07-04", "2012-07-04",
  "2010-02-14", "2011-02-14", "2012-02-14"
)
determine_holiday = function(day) {
  for (holiday in holidays) {
    if(between(as.numeric(as.Date(holiday,format="%Y-%m-%d")-day), -6, 0)) {
      return(TRUE)
    }
  }
  return(FALSE)
}

# Allows determine_holiday to take in one date at a time in the ifelse
determine_holiday_vectorized <- Vectorize(determine_holiday)

store_data_with_holidays = store_data %>%
  mutate(
    Holiday_Flag = ifelse(Holiday_Flag==0, ifelse(determine_holiday_vectorized(Date),2,Holiday_Flag),Holiday_Flag)
  )

head(store_data_with_holidays %>% filter(Holiday_Flag==2))
##   Store       Date Weekly_Sales Holiday_Flag Temperature Fuel_Price      CPI
## 1     1 2010-02-19    1611968.2            2       39.93      2.514 211.2891
## 2     2 2010-02-19    2124451.5            2       39.69      2.514 210.9452
## 3     3 2010-02-19     421642.2            2       47.07      2.514 214.6199
## 4     4 2010-02-19    2049860.3            2       36.45      2.540 126.5263
## 5     5 2010-02-19     303447.6            2       41.14      2.514 211.8471
## 6     6 2010-02-19    1567138.1            2       43.58      2.514 212.8162
##   Unemployment Year Month Day Week
## 1        8.106 2010     2  19    8
## 2        8.324 2010     2  19    8
## 3        7.368 2010     2  19    8
## 4        8.623 2010     2  19    8
## 5        6.566 2010     2  19    8
## 6        7.259 2010     2  19    8
# 6, 14, 23, 27, 40, 45, 47, 48, 49, 51, 52
store_data_with_holidays %>% 
  filter(Week %in% c(6,14,23,27,40,45,47,48,49,51,52) | Holiday_Flag %in% c(1,2)) %>% 
  select(Date,Week,Holiday_Flag) %>% 
  unique() %>% 
  select(Date,Week,Holiday_Flag)
##            Date Week Holiday_Flag
## 1    2010-02-05    6            0
## 46   2010-02-12    7            1
## 91   2010-02-19    8            2
## 136  2010-04-02   14            0
## 181  2010-04-09   15            2
## 226  2010-06-04   23            2
## 271  2010-07-02   27            0
## 316  2010-07-09   28            2
## 361  2010-09-10   37            1
## 406  2010-10-01   40            0
## 451  2010-11-05   45            2
## 496  2010-11-19   47            0
## 541  2010-11-26   48            1
## 586  2010-12-03   49            0
## 631  2010-12-17   51            0
## 676  2010-12-24   52            0
## 721  2010-12-31   53            1
## 766  2011-02-11    6            1
## 811  2011-02-18    7            2
## 856  2011-04-08   14            0
## 901  2011-04-29   17            2
## 946  2011-06-03   22            2
## 991  2011-06-10   23            0
## 1036 2011-07-08   27            2
## 1081 2011-09-09   36            1
## 1126 2011-10-07   40            0
## 1171 2011-11-04   44            2
## 1216 2011-11-11   45            0
## 1261 2011-11-25   47            1
## 1306 2011-12-02   48            0
## 1351 2011-12-09   49            0
## 1396 2011-12-23   51            0
## 1441 2011-12-30   52            1
## 1486 2012-02-10    6            1
## 1531 2012-02-17    7            2
## 1576 2012-04-06   14            0
## 1621 2012-04-13   15            2
## 1666 2012-06-01   22            2
## 1711 2012-06-08   23            0
## 1756 2012-07-06   27            2
## 1801 2012-09-07   36            1
## 1846 2012-10-05   40            0
# https://search.r-project.org/CRAN/refmans/IDPmisc/html/peaks.html
store_data %>% 
  ggplot()+geom_point(aes(x=Temperature,y=Weekly_Sales,color=factor(Store)))

store_data %>% 
  ggplot()+geom_point(aes(x=Fuel_Price,y=Weekly_Sales,color=factor(Store)))

store_data %>% 
  ggplot()+geom_point(aes(x=CPI,y=Weekly_Sales,color=factor(Store)))

roundedstore_data = store_data %>% 
  mutate(
    RoundedTemp=round(Temperature,digits=-1),
    RoundedCPI=round(CPI,digits=-1),
    RoundedFuel=round(Fuel_Price,digits=1),
    RoundedUnemployment=round(Unemployment,digits=1)
  )

roundedstore_data %>%
  group_by(Store) %>% 
  ggplot(aes(x=Store,y=Weekly_Sales)) + geom_point(aes(color=factor(RoundedTemp)))

roundedstore_data %>%
  group_by(Store) %>% 
  ggplot(aes(x=RoundedTemp,y=Weekly_Sales)) + geom_point(aes(color=factor(Store)))

roundedstore_data %>%
  group_by(RoundedTemp) %>% 
  summarize(AvgSales=mean(Weekly_Sales)) %>% 
  ggplot(aes(x=RoundedTemp,y=AvgSales)) + geom_bar(stat = "identity")

roundedstore_data %>%
  group_by(RoundedCPI) %>% 
  summarize(AvgSales=mean(Weekly_Sales)) %>% 
  ggplot(aes(x=RoundedCPI,y=AvgSales)) + geom_bar(stat = "identity")

roundedstore_data %>%
  group_by(RoundedFuel) %>% 
  summarize(AvgSales=mean(Weekly_Sales)) %>% 
  ggplot(aes(x=RoundedFuel,y=AvgSales)) + geom_bar(stat = "identity")

roundedstore_data %>%
  group_by(RoundedUnemployment) %>% 
  summarize(AvgSales=mean(Weekly_Sales)) %>% 
  ggplot(aes(x=RoundedUnemployment,y=AvgSales)) + 
    geom_bar(stat = "identity") + 
    geom_smooth(method="lm", aes(color="red")) +
    theme(legend.position = "none")
## `geom_smooth()` using formula = 'y ~ x'

store_data %>% 
  group_by(Store) %>% 
  summarize(CPI=mean(CPI),Unemployment=mean(Unemployment),Fuel_Price=mean(Fuel_Price),Weekly_Sales=mean(Weekly_Sales)) %>% 
  arrange(desc(Weekly_Sales))
## # A tibble: 45 × 5
##    Store   CPI Unemployment Fuel_Price Weekly_Sales
##    <int> <dbl>        <dbl>      <dbl>        <dbl>
##  1    20  209.         7.37       3.42     2107677.
##  2     4  129.         5.96       3.22     2094713.
##  3    14  186.         8.65       3.42     2020978.
##  4    13  129.         7.00       3.29     2003620.
##  5     2  216.         7.62       3.22     1925751.
##  6    10  129.         8.36       3.58     1899425.
##  7    27  139.         8.00       3.58     1775216.
##  8     6  218.         6.61       3.22     1564728.
##  9     1  216.         7.61       3.22     1555264.
## 10    39  215.         7.87       3.22     1450668.
## # ℹ 35 more rows
store_data_with_holidays %>% 
  group_by(Date) %>% 
  ggplot(aes(x=Date,y=Weekly_Sales))+geom_point(aes(color=factor(Holiday_Flag)))

store_data_with_holidays %>% 
  group_by(Store) %>% 
  ggplot(aes(x=factor(Store),y=Weekly_Sales))+geom_point(aes(color=factor(Holiday_Flag)))

rows = store_data %>% 
  group_by(Store,Year) %>% 
  summarize(n=n())
## `summarise()` has grouped output by 'Store'. You can override using the
## `.groups` argument.
high_low = store_data %>% 
  left_join(rows,by=c('Store','Year')) %>% 
  group_by(Store,Year) %>% 
  mutate(rank=min_rank(-Weekly_Sales)) %>% 
  filter(rank==1 | rank==n) %>% 
  arrange(Store, n) %>% 
  mutate(high=ifelse(rank==1,1,0))

high_low %>% 
  ggplot(aes(x=Date,y=Weekly_Sales)) + geom_point(aes(color=factor(high), shape=factor(Holiday_Flag))) +
  labs(title="Date vs Weekly Sales to Show Most/Least Sales, Accounting for Holidays")

yearly_sales = store_data_with_holidays %>% 
  group_by(Store, Year) %>% 
  summarize(Yearly_Sales=sum(Weekly_Sales))
## `summarise()` has grouped output by 'Store'. You can override using the
## `.groups` argument.
store_data_with_holidays = store_data_with_holidays %>% 
  left_join(yearly_sales, by=c("Store","Year")) %>% 
  mutate(Percentage_Sales=Weekly_Sales/Yearly_Sales)

head(store_data_with_holidays)
##   Store       Date Weekly_Sales Holiday_Flag Temperature Fuel_Price      CPI
## 1     1 2010-02-05    1643690.9            0       42.31      2.572 211.0964
## 2     2 2010-02-05    2136989.5            0       40.19      2.572 210.7526
## 3     3 2010-02-05     461622.2            0       45.71      2.572 214.4249
## 4     4 2010-02-05    2135143.9            0       43.76      2.598 126.4421
## 5     5 2010-02-05     317173.1            0       39.70      2.572 211.6540
## 6     6 2010-02-05    1652635.1            0       40.43      2.572 212.6224
##   Unemployment Year Month Day Week Yearly_Sales Percentage_Sales
## 1        8.106 2010     2   5    6     73278832       0.02243064
## 2        8.324 2010     2   5    6     95277864       0.02242902
## 3        7.368 2010     2   5    6     18745419       0.02462587
## 4        8.623 2010     2   5    6     95680471       0.02231536
## 5        6.566 2010     2   5    6     14836031       0.02137857
## 6        7.259 2010     2   5    6     76912321       0.02148726
store_data_with_holidays %>% 
  ggplot()+geom_point(aes(x=Temperature,y=Percentage_Sales,color=factor(Store)))

store_data_with_holidays %>% 
  ggplot()+geom_point(aes(x=CPI,y=Percentage_Sales,color=factor(Store)))

store_data_with_holidays %>% 
  ggplot()+geom_point(aes(x=Fuel_Price,y=Percentage_Sales,color=factor(Store)))

store_data_with_holidays %>% 
  ggplot()+geom_point(aes(x=Unemployment,y=Percentage_Sales,color=factor(Store)))

store_data_with_holidays %>% 
  group_by(Store) %>% 
  ggplot(aes(x=factor(Store),y=Percentage_Sales))+geom_point(aes(color=factor(Holiday_Flag)))

rounded_store_data_with_holidays = store_data_with_holidays %>% 
  mutate(
    RoundedTemp=round(Temperature,digits=-1),
    RoundedCPI=round(CPI,digits=-1),
    RoundedFuel=round(Fuel_Price,digits=1),
    RoundedUnemployment=round(Unemployment,digits=1)
  )

rounded_store_data_with_holidays %>%
  group_by(RoundedUnemployment) %>% 
  summarize(AvgSales=mean(Percentage_Sales)) %>% 
  ggplot(aes(x=RoundedUnemployment,y=AvgSales)) + 
    geom_bar(stat = "identity") + 
    geom_smooth(method="lm", aes(color="red")) +
    theme(legend.position = "none")
## `geom_smooth()` using formula = 'y ~ x'

rounded_store_data_with_holidays %>%
  group_by(RoundedTemp) %>% 
  summarize(AvgSales=mean(Percentage_Sales)) %>% 
  ggplot(aes(x=RoundedTemp,y=AvgSales)) + 
    geom_bar(stat = "identity") + 
    geom_smooth(method="lm", aes(color="red")) +
    theme(legend.position = "none")
## `geom_smooth()` using formula = 'y ~ x'

rounded_store_data_with_holidays %>%
  group_by(RoundedCPI) %>% 
  summarize(AvgSales=mean(Percentage_Sales)) %>% 
  ggplot(aes(x=RoundedCPI,y=AvgSales)) + 
    geom_bar(stat = "identity") + 
    geom_smooth(method="lm", aes(color="red")) +
    theme(legend.position = "none")
## `geom_smooth()` using formula = 'y ~ x'

rounded_store_data_with_holidays %>%
  group_by(RoundedFuel) %>% 
  summarize(AvgSales=mean(Percentage_Sales)) %>% 
  ggplot(aes(x=RoundedFuel,y=AvgSales)) + 
    geom_bar(stat = "identity") + 
    geom_smooth(method="lm", aes(color="red")) +
    theme(legend.position = "none")
## `geom_smooth()` using formula = 'y ~ x'